import cpuinfo
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
import time
import pandas as pd
import cpuinfo
# Look up the real CPU model online to check whether the CPU's real
# architecture matches the architecture reported to Python.
# A mismatch means the Python build in use is not native to this CPU,
# which makes Selenium very slow; pick the Python build that matches the CPU.
cpu_i = cpuinfo.get_cpu_info()
arch_line = f"CPU虛擬架構:{cpu_i['arch_string_raw']}"
print(arch_line)
brand_line = f"CPU真實型號:{cpu_i['brand_raw']}"
print(brand_line)
CPU虛擬架構:arm64 CPU真實型號:Apple M1 Pro
1、確認Chrome版本
2、下載對應版本之ChromeDriver,注意處理器型號,網址 https://chromedriver.chromium.org/chromedriver-canary
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# # 方法一:自己檢查自己下載,Mac可能會被擋住
# driver_file = Service('../chromedriver_mac64/chromedriver')
# driver = webdriver.Chrome(service=driver_file)
# # 測試用
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# driver_file = Service('../chromedriver_mac64/chromedriver') # 你下載ChromeDriver的位置
# driver = webdriver.Chrome(service=driver_file)
1、pip install webdriver-manager
# !pip install webdriver-manager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
# Smoke test: download a matching ChromeDriver, open an incognito Chrome
# window, wait one second, then shut the browser down again.
driver_file = ChromeDriverManager().install()
options = webdriver.ChromeOptions()
options.add_argument('--incognito')  # incognito mode
driver = webdriver.Chrome(service=Service(driver_file), options=options)
try:
    time.sleep(1)
finally:
    # quit() ends the whole WebDriver session (all windows plus the
    # chromedriver process); close() would only close the current window.
    driver.quit()
# # 測試用
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from webdriver_manager.chrome import ChromeDriverManager
# driver_file = ChromeDriverManager().install()
# driver = webdriver.Chrome(service=Service(driver_file))
Step-1: 列出要抓取的資料
Step-2: 分析Chrome中網頁原始碼的架構
Step-3: 觀察Chrome中目標資料element的特徵
Step-4: 搜索element;搜索element的方式,非必要一律用CSS Selector
Step-5: 從element中提取資料
Note: 爬蟲會不斷重複2~5的步驟
## 小功能function
# 移動element至可見
def move_element_be_visible(driver, element, offset=150):
    """Scroll the page so that ``element`` is inside the viewport.

    Args:
        driver: WebDriver-like object; only ``execute_script`` is called on it.
        element: WebElement-like object; only its
            ``location_once_scrolled_into_view`` property is read.
        offset: Number of pixels to scroll back up once the element has been
            brought into view. Defaults to 150.
    """
    # Reading this property is what makes the browser scroll; the returned
    # coordinates are not needed. An alternative with the same effect:
    # driver.execute_script("arguments[0].scrollIntoView();", element)
    _ = element.location_once_scrolled_into_view
    # Scroll back up by ``offset`` pixels -- presumably so the element is not
    # hidden under the page's fixed header (assumption; confirm on the page).
    driver.execute_script(f'window.scrollBy(0,{-offset})')
# 滑鼠移動至element
def mouse_move_to(driver, element):
    """Move the mouse pointer onto ``element``.

    Builds an ``ActionChains`` for ``driver``, queues a move to ``element``
    and performs it immediately. Returns ``None``.
    """
    ActionChains(driver).move_to_element(element).perform()
# 一般模式登陸
# driver_file = ChromeDriverManager().install()
# driver = webdriver.Chrome(service=Service(driver_file))
# url = "https://www.facebook.com/groups/foodspick/?locale=zh_TW&checkpoint_src=any"
# driver.get(url)
# 手動登陸
# Log in using an incognito window
options = webdriver.ChromeOptions()
options.add_argument('--incognito')  # incognito mode
driver_file = ChromeDriverManager().install()
driver = webdriver.Chrome(options=options, service=Service(driver_file))
url = "https://www.facebook.com/groups/foodspick/?locale=zh_TW&checkpoint_src=any"
driver.get(url)
# Log in manually in the opened browser window
# All posts (feed items) loaded on the page so far
feed_selector = "div[class='x1yztbdb x1n2onr6 xh8yej3 x1ja2u2z']"
elements = driver.find_elements(By.CSS_SELECTOR, feed_selector)
elements
[<selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_417")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_418")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_419")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_420")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_421")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_422")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_423")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_424")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_425")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_426")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_427")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_428")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_429")>, 
<selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_430")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_431")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_432")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_433")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_434")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_435")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_436")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_437")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_438")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_439")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_440")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_441")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_442")>, 
<selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_443")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_444")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_445")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_446")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_447")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_149")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_154")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_448")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_449")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_450")>, <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_451")>]
# Pick the post to scrape by its position in the list of loaded posts
post_index = 32
feed = elements[post_index]
# Scroll the chosen post into view
move_element_be_visible(driver, feed)
# Post author
author_spans = feed.find_elements(By.CSS_SELECTOR, "h2>span[class='xt0psk2']")
feed_man = author_spans[0].text
print(feed_man)
傅鏡暉
# Post time
# Hover the post's timestamp link, wait, then read the text of the first
# tooltip found on the page.
locator = "div[class='xu06os2 x1ok221b'] > span[dir='auto'] > span > span> span > a[role='link'][tabindex='0']"
element = feed.find_elements(By.CSS_SELECTOR, locator)[0]
mouse_move_to(driver, element)
time.sleep(1) # wait for the page to react
# NOTE(review): the tooltip is searched on the whole page (driver), not inside
# ``feed``, and index 0 is taken -- confirm no other tooltip is open at this point.
feed_time = driver.find_elements(By.CSS_SELECTOR, "div[role='tooltip']")[0].text
print(feed_time)
2024年5月3日 星期五下午9:47
# Post content -- text
locator = "div[dir='auto'] > div[class='x1iorvi4 x1pi30zi x1l90r2v x1swvt13'] > span[dir='auto']"
content_spans = feed.find_elements(By.CSS_SELECTOR, locator)
feed_content = content_spans[0].text
print(feed_content)
高CP值吃到飽自助餐,用餐空間高雅舒適,生魚片及海鮮新鮮、有蒸生蠔、牛排、干貝蟹肉煲、炸蝦、鴨肝醬等百種美食及哈根達斯冰淇淋。現正舉辦甜點季,自即日起到5/17,請到日本米其林甜點主廚製作10款巧克力甜點。
# Post content -- link (href of the first matching external anchor)
link_selector = "div[class='xmjcpbm x1n2onr6'] > div > a[rel='nofollow noreferrer'][role='link'][tabindex='0']"
element = feed.find_elements(By.CSS_SELECTOR, link_selector)[0]
feed_link = element.get_attribute('href')
feed_link
'https://lordcat.tw/archives/141343'
# Post content -- link text
link_text_selector = "span[dir='auto'] > span > span[dir='auto']"
element = feed.find_elements(By.CSS_SELECTOR, link_text_selector)[0]
feed_link_alt = element.text
feed_link_alt
'高CP值吃到飽自助餐,生魚片、蒸生蠔、牛排、干貝蟹肉煲等百種美食及哈根達斯'
# Post content -- picture (src of the first matching image)
picture_nodes = feed.find_elements(By.CSS_SELECTOR, "div[class='x10l6tqk x13vifvy'] > img")
feed_pic = picture_nodes[0].get_attribute('src')
feed_pic
'https://external.ftpe8-3.fna.fbcdn.net/emg1/v/t13/13706082881180517361?url=https%3A%2F%2Flordcat.tw%2Fwp-content%2Fuploads%2F2024%2F04%2F1714218163-a6f0166bd53bdcb983763ed0318a1f96.jpg&fb_obo=1&utld=lordcat.tw&stp=c0.5000x0.5000f_dst-jpg_flffffff_p1000x522_q75&ccb=13-1&oh=06_Q39951lI8udLAh0424JzpsIefLv6N5LrSkU_Q4cAoPFrgro&oe=663BF857&_nc_sid=085657'
# Like count (text of the first matching counter span)
reaction_selector = "span[class='x1e558r4']"
element = feed.find_elements(By.CSS_SELECTOR, reaction_selector)[0]
emotion = element.text
emotion
'33'
# Comment count (first of the matching counter buttons)
counter_selector = "div > span >div[role='button'][tabindex='0'] > div > div > span[dir='auto']"
element = feed.find_elements(By.CSS_SELECTOR, counter_selector)[0]
comment_number = element.text
comment_number
'9'
# Share count (second of the matching counter buttons; index 0 is used for
# the comment count).
# Kept in its own variable so the comment count held in ``comment_number``
# is not overwritten.
element = feed.find_elements(By.CSS_SELECTOR, "div > span >div[role='button'][tabindex='0'] > div > div > span[dir='auto']")[1]
share_number = element.text
share_number
'2'
# Commenter (text of the first matching span in the post)
commenter_selector = "span[class='x3nfvp2']"
element = feed.find_elements(By.CSS_SELECTOR, commenter_selector)[0]
comment_man = element.text
comment_man
'吳莉莉'
# Comment time
# Hover the first matching element in the post, wait, then read the text of
# the first tooltip found on the page.
element = feed.find_elements(By.CSS_SELECTOR, "div[class='x6s0dn4 x3nfvp2']")[0]
mouse_move_to(driver, element)
time.sleep(1) # wait for the page to react
# NOTE(review): the tooltip is searched on the whole page (driver), not inside
# ``feed``, and index 0 is taken -- confirm no other tooltip is open at this point.
comment_time = driver.find_elements(By.CSS_SELECTOR, "div[role='tooltip']")[0].text
print(comment_time)
2024年5月3日 星期五下午11:08
# Comment content (text of the first matching comment body)
comment_body_selector = "div[class='x1lliihq xjkvuk6 x1iorvi4']"
element = feed.find_elements(By.CSS_SELECTOR, comment_body_selector)[0]
comment_content = element.text
comment_content
'母親節可以來這裡了'
# Gather the scraped fields into a one-row table and export it as CSV
feed_record = {
    "Feed Number": 1,
    "Feed Man": feed_man,
    "Feed Time": feed_time,
    "Feed Content": feed_content,
    "Feed Content Picture": feed_pic,
    "Feed Link String": feed_link_alt,
    "Feed Link": feed_link,
    "Comment Man": comment_man,
    "Comment Time": comment_time,
    "Comment Content": comment_content,
}
# Each value becomes a single-element column, giving a one-row DataFrame.
df_feeds = pd.DataFrame({column: [value] for column, value in feed_record.items()})
df_feeds.to_csv("../feeds.csv", encoding="utf-8-sig", index=False)
Feed Number | Feed Man | Feed Time | Feed Content | Feed Content Picture | Feed Link String | Feed Link | Comment Man | Comment Time | Comment Content | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 傅鏡暉 | 2024年5月3日 星期五下午11:08 | 高CP值吃到飽自助餐,用餐空間高雅舒適,生魚片及海鮮新鮮、有蒸生蠔、牛排、干貝蟹肉煲、炸蝦、... | https://external.ftpe8-3.fna.fbcdn.net/emg1/v/... | 高CP值吃到飽自助餐,生魚片、蒸生蠔、牛排、干貝蟹肉煲等百種美食及哈根達斯 | https://lordcat.tw/archives/141343 | 吳莉莉 | 2024年5月3日 星期五下午11:08 | 母親節可以來這裡了 |
# End the WebDriver session: quit() closes every window and stops the
# chromedriver process, whereas close() would only close the current window.
driver.quit()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[1], line 1 ----> 1 driver.close() NameError: name 'driver' is not defined